Prediction with Nexa

This is a simple example that extracts the code vectors (features) from Nexa and uses them to predict the letter sequence with scikit-learn classifiers.


In [56]:
import numpy as np
import h5py
from sklearn import svm
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.naive_bayes import MultinomialNB

Information extraction


In [57]:
# First we load the file 
file_location = '../results_database/text_wall_street.hdf5'
run_name = '/low-resolution'

In [58]:
# Nexa parameters
Nspatial_clusters = 5
Ntime_clusters = 15
Nembedding = 3

parameters_string = '/' + str(Nspatial_clusters)
parameters_string += '-' + str(Ntime_clusters)
parameters_string += '-' + str(Nembedding)

f = h5py.File(file_location, 'r')
nexa = f[run_name + parameters_string]
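If you are unsure what a given run stores, the datasets under the parameter group can be listed directly (the names printed here are the ones read back in the next cell):

In [ ]:
# List the datasets stored for this parameter combination
print(list(nexa.keys()))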

In [59]:
# Now we extract the time and the code vectors
time = nexa['time']
code_vectors = nexa['code-vectors']
code_vectors_distance = nexa['code-vectors-distance']
code_vectors_softmax = nexa['code-vectors-softmax']
code_vectors_winner = nexa['code-vectors-winner']

In [60]:
# Now we need to get the letters and align them
text_directory = '../data/wall_street_letters.npy'
letters_sequence = np.load(text_directory)
Nletters = len(letters_sequence)
symbols = set(letters_sequence)
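The slicing below assumes the code vectors line up one sample per letter, so it is worth comparing their lengths first (a quick sanity check, assuming that correspondence holds):

In [ ]:
# The code vectors should line up one-to-one with the letters
# for the delay slicing below to make sense (assumption)
print(code_vectors_winner.shape[0], Nletters)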

Predictions


In [61]:
# Parameters
N = 5000   # number of samples to use
delay = 5  # predict the letter `delay` steps ahead
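The slicing used throughout pairs the code vector at time t with the letter delay steps later, so the classifiers are predicting ahead in the sequence. A toy illustration with made-up data:

In [ ]:
# Toy illustration of the delay alignment (made-up data)
seq = np.array(list('abcdefgh'))      # letters over time
feats = np.arange(len(seq))           # stand-in for the code vectors
toy_delay = 3
X_toy = feats[:len(seq) - toy_delay]  # feature at time t ...
y_toy = seq[toy_delay:]               # ... paired with the letter at t + delay
# X_toy[0] (time 0) is paired with 'd', the letter 3 steps later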

First let's use winner-takes-all


In [62]:
# Make prediction with scikit-learn
X = code_vectors_winner[:(N-delay)]
y = letters_sequence[delay:N]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [63]:
clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100
print('SVM linear score', score)

clf_rbf = svm.SVC(C=1.0, cache_size=200, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100
print('SVM RBF score', score)

clf_b = MultinomialNB()
clf_b.fit(X_train, y_train)
score = clf_b.score(X_test, y_test) * 100
print('Multinomial score', score)


SVM linear score 74.6
SVM RBF score 69.4
Multinomial score 72.4
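The train-and-score cell above is repeated verbatim for each representation below; a small helper (a sketch using the same classifiers and split) would remove the duplication:

In [ ]:
def evaluate(X, y, test_size=0.10):
    """Split, fit the three classifiers, and print their test scores."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    classifiers = [('SVM linear', svm.SVC(C=1.0, cache_size=200, kernel='linear')),
                   ('SVM RBF', svm.SVC(C=1.0, cache_size=200, kernel='rbf')),
                   ('Multinomial', MultinomialNB())]
    for name, clf in classifiers:
        clf.fit(X_train, y_train)
        print(name, 'score', clf.score(X_test, y_test) * 100)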

Now let's use softmax


In [64]:
# Make prediction with scikit-learn
X = code_vectors_softmax[:(N-delay)]
y = letters_sequence[delay:N]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [65]:
clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100
print('SVM linear score', score)

clf_rbf = svm.SVC(C=1.0, cache_size=200, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100
print('SVM RBF score', score)

clf_b = MultinomialNB()
clf_b.fit(X_train, y_train)
score = clf_b.score(X_test, y_test) * 100
print('Multinomial score', score)


SVM linear score 76.4
SVM RBF score 71.2
Multinomial score 73.8

Now let's use the normal representation (not binary)


In [66]:
# Make prediction with scikit-learn
X = code_vectors[:(N-delay)]
y = letters_sequence[delay:N]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [67]:
clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100
print('SVM linear score', score)

clf_rbf = svm.SVC(C=1.0, cache_size=200, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100
print('SVM RBF score', score)

clf_b = MultinomialNB()
clf_b.fit(X_train, y_train)
score = clf_b.score(X_test, y_test) * 100
print('Multinomial score', score)


SVM linear score 33.6
SVM RBF score 65.2
Multinomial score 25.0

Now we move on and use distance (this sometimes fails to converge, which seems dodgy)


In [68]:
# Make prediction with scikit-learn
X = code_vectors_distance[:(N-delay)]
y = letters_sequence[delay:N]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [69]:
clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100
print('SVM linear score', score)

clf_rbf = svm.SVC(C=1.0, cache_size=200, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100
print('SVM RBF score', score)

clf_b = MultinomialNB()
clf_b.fit(X_train, y_train)
score = clf_b.score(X_test, y_test) * 100
print('Multinomial score', score)


SVM linear score 98.6
SVM RBF score 51.8
Multinomial score 46.4

Predictions with Standardization


In [70]:
from sklearn import preprocessing
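A caveat before proceeding: preprocessing.scale standardizes with the mean and variance of the whole array, so the held-out split influences the scaling. A leak-free variant (a sketch) fits a StandardScaler on the training portion only:

In [ ]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)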

First let's use winner-takes-all. Note that MultinomialNB is dropped from here on: it requires non-negative features, and standardization produces negative values.


In [71]:
# Make prediction with scikit-learn
X = code_vectors_winner[:(N-delay)]
y = letters_sequence[delay:N]
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100
print('SVM linear score', score)

clf_rbf = svm.SVC(C=1.0, cache_size=200, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100
print('SVM RBF score', score)


SVM linear score 75.6
SVM RBF score 80.4

Softmax


In [72]:
# Make prediction with scikit-learn
X = code_vectors_softmax[:(N-delay)]
y = letters_sequence[delay:N]
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100
print('SVM linear score', score)

clf_rbf = svm.SVC(C=1.0, cache_size=200, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100
print('SVM RBF score', score)


SVM linear score 79.4
SVM RBF score 80.6

Distance representation (standardized)


In [73]:
# Make prediction with scikit-learn
X = code_vectors_distance[:(N-delay)]
y = letters_sequence[delay:N]
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100
print('SVM linear score', score)

clf_rbf = svm.SVC(C=1.0, cache_size=200, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100
print('SVM RBF score', score)


SVM linear score 99.2
SVM RBF score 90.8

Now distance again, without standardization, for comparison (this sometimes fails to converge, which seems dodgy)


In [74]:
# Make prediction with scikit-learn
X = code_vectors_distance[:(N-delay)]
y = letters_sequence[delay:N]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

clf = svm.SVC(C=1.0, cache_size=200, kernel='linear')
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) * 100
print('SVM linear score', score)

clf_rbf = svm.SVC(C=1.0, cache_size=200, kernel='rbf')
clf_rbf.fit(X_train, y_train)
score = clf_rbf.score(X_test, y_test) * 100
print('SVM RBF score', score)


SVM linear score 97.8
SVM RBF score 55.8
